Introduction

In this notebook, we will explore the mobility signal from Safegraph via Delphi Epidata API in a state-level. We also look at intervention data from State-level social distancing policies in response to the 2019 novel coronavirus in the US. Also, we specifically look at the time frame starting from Feb. 2020 to present. We will cover the following discovery:

  • Mobility signal across states over time

  • Correlation between case count and future mobility signal

  • Lag analysis

    • Other signals correlate with future mobility

    • Other signals correlate with future restaurant visit

  • Variability of policy across states

    • Filtered by mandatory policy

    • Distinct Count of State-wide policy across states

  • Correlation between number of policies and mobility

  • Correlation in Space between number of policies and future mobility

  • Distribution of mobility by various intervention across states

library(ggplot2)
library (readr)
library(tidyverse)
library(dplyr)
library(covidcast)
library(lubridate)
library(ggpubr)
library(zoo)
library(tidyr)
library(gridExtra)
library(cowplot)
library(gplots)
library(car)
library(reshape2)


source("code/painter.r")
source("code/load_all_data.r")
STARTDATE <- "2020-02-20"
ENDDATE <- trunc(Sys.time(), "days")
GEO_TYPE = "state" # state-level
GEO_VALUE = "*" # all states
EXCLUDED_AREAS = c("as","gu", "mp","vi") # excluded areas due to small sample size
DT_X = 7 #   Time shifts to consider for x
data <- load_covidcast_data(STARTDATE, ENDDATE, GEO_TYPE, GEO_VALUE, EXCLUDED_AREAS)


# The fraction of mobile devices that spent more than 6 hours at a 
# location other than their home during the daytime 
# (SafeGraph’s full_time_work_behavior_devices / device_count)
ftime <- data$Full.Time.Mobility

#The fraction of devices that spent between 3 and 6 hours at a location other than their home during the daytime (SafeGraph’s part_time_work_behavior_devices / device_count)
ptime <-data$Part.Time.Mobility

############## New confirmed COVID19 cases ############

# A composite signal from JHU and USA facts
# New confirmed COVID19 cases on average per 7 days
case_avg <- data$Avg.Confirmed.Case.Count

# Cumulative confirmed COVID19 cases on average per 7 days
cum_case <- data$Cum.Avg.Case.Count
# Cumulative confirmed COVID19 cases on average per 7 days, per 100,000 population
cum_case_prop <- data$Cum.Avg.Case.Count.Prop


########### Death cases ######################

# Number of new confirmed deaths due to COVID-19, daily
death_case <- data$Avg.Death.Case.Count

# Cumulative number of confirmed deaths due to COVID-19
cum_death_case <- data$Cum.Avg.Death.Count

# state restaurant visit number 
new_res <- data$Restaurant.Visit.Count

# Get the doctor visit signal
smoothed_cli<- data$smoothed_cli
smoothed_adj_cli<- data$smoothed_adj_cli

Mobility signal across states over time

We can see that full-time away home signal drops across all the states in April and gradually increase. Part-time away home signal also behave with the same trend.

p <- ggplot(ftime, aes(x=time_value, y=value)) +
  geom_line(aes(color = geo_value)) + 
  labs(title = "Full-time away home signal", y= "The fraction of mobile devices that spent more than 6 hours other than home")
p

p <- ggplot(ptime, aes(x=time_value, y=value)) +
  geom_line(aes(color = geo_value)) + 
  labs(title = "Part-time away home signal", y= "The fraction of mobile devices that spent between 3 and 6 hours other than home")

p

Correlation between case count and future mobility signal

Pearson correlation (line chart)

We might be intersted in knowing how case count in the present correlate with mobility signal in the future. We will use full_time_work_prop as the response variable from now on.

Using the dt_x argument in the function covidcast_cor(), we can shift the signal by 7 days forward in time, before calculating correlations. We would like to compare a week forward and 2 weeks forward in time with the present correlation.

We can see that the overall pattern of the correlations are very similar. However, we can see that the correlation increases when the shift is increased.

cor1 <- covidcast_cor(case_avg, ftime, by = "time_value")
cor2 <- covidcast_cor(case_avg, ftime,  by = "time_value", dt_x = DT_X)
cor3 <- covidcast_cor(case_avg, ftime,  by = "time_value", dt_x = 14)


# Stack rowwise into one data frame, then plot time series
all_cor <- rbind(cor1, cor2, cor3)
# Add labels
all_cor$Shift <- as.factor(c(rep(0, nrow(cor1)), rep(DT_X, nrow(cor2)), rep(14, nrow(cor3))))

# Plot the graph
signal_name = "full-time away home"
p <- ggplot(all_cor, aes(x = time_value, y = value)) +
  geom_line(aes(color = Shift)) +
  labs(title = sprintf("Pearson Correlation between case and future %s", signal_name),
       subtitle = "Average per 7 days, over states",
       x = "Date", y = "Correlation") 
p
## Warning: Removed 30 row(s) containing missing values (geom_path).

Spearman correlation (line chart)

scor1 <- covidcast_cor(case_avg, ftime, by = "time_value",  method = "spearman")
scor2 <- covidcast_cor(case_avg, ftime,  by = "time_value", dt_x = DT_X,  method = "spearman")
scor3 <- covidcast_cor(case_avg, ftime,  by = "time_value", dt_x = 14,  method = "spearman")

# Stack rowwise into one data frame, then plot time series
all_scor <- rbind(scor1, scor2, scor3)
# Add labels
all_scor$Shift <- as.factor(c(rep(0, nrow(scor1)), rep(DT_X, nrow(scor2)), rep(10, nrow(scor3))))

# Plot the graph
signal_name = "full-time away home"
p <- ggplot(all_scor, aes(x = time_value, y = value)) +
  geom_line(aes(color = Shift)) +
  labs(title = sprintf("Spearman Correlation between %s and cases", signal_name),
       subtitle = "Average per 7 days, over states",
       x = "Date", y = "Correlation") 
p

Pearson correlation on a map (10-days mobility signal forwarded in time)

# Set a bunch of fields so that the data frame knows how to plot itself
cor3_by_geo <- covidcast_cor(case_avg, ftime,  by = "geo_value", dt_x = 10)

cor3_by_geo$time_value = STARTDATE
cor3_by_geo$issue = STARTDATE
attributes(cor3_by_geo)$geo_type = "state"
class(cor3_by_geo) = c("covidcast_signal", "data.frame")

# Plot choropleth maps, using the covidcast plotting functionality
plot(cor3_by_geo, title = "Correlations between 10-day shifted cases and mobility signal",
     range = c(-1, 1), choro_col = c("orange","lightblue", "purple"))

Lag analysis on covidcast signals

What lag mobility is most correlated with the other signals? We would like to find out this information for selecting the most correlated lag for model building later on.

Other signals correlate with future mobility

Pearson Correlation (slicing by state)

SHIFTDAY <- 100
corr.method <- "pearson"
by_method <- "geo_value"
title <- "Median Pearson correlation between other signals and future mobility (slicing by state)"

covidcastlike.signals <- list(case_avg, cum_case, cum_case_prop, death_case, cum_death_case, smoothed_cli, smoothed_adj_cli)
names <- list("7-day avg. confirmed case", 
             "Cum 7day avg. confirmed case",
             "Cum 7day avg. confirmed case, per 100,000",
             "death case",
             "cumulative death case",
             "doctor visit",
             "doctor visit (day-of-week effects removed)")


# Compute pearson correlation between other covidcast-like signals and mobility
plot.all.Corr.Median.by.shift(covidcastlike.signals, 
                              ftime,
                              SHIFTDAY,
                              names, 
                              corr.method, 
                              title, 
                              by_method)

Pearson Correlation (slicing by time)

title <- "Median Pearson correlation between other signals and future mobility (slicing by time)"
by_method <- "time_value"


# Compute pearson correlation between other covidcast-like signals and mobility
plot.all.Corr.Median.by.shift(covidcastlike.signals, 
                              ftime,
                              SHIFTDAY,
                              names, 
                              corr.method, 
                              title, 
                              by_method)

Rank correlation (slicing by state)

title <- "Median Spearman correlation between other signals and future mobility (slicing by state)"
by_method <- "geo_value"
corr.method <- "spearman"

# Compute spearman correlation between other covidcast-like signals and mobility
plot.all.Corr.Median.by.shift(covidcastlike.signals, 
                              ftime,
                              SHIFTDAY,
                              names, 
                              corr.method, 
                              title, 
                              by_method)

Other signals correlate with future mobility (slicing by time)

title <- "Median Spearman correlation between other signals and future mobility (slicing by time)"
by_method <- "time_value"
corr.method <- "spearman"

# Compute spearman correlation between other covidcast-like signals and mobility
plot.all.Corr.Median.by.shift(covidcastlike.signals, 
                              ftime,
                              SHIFTDAY,
                              names, 
                              corr.method, 
                              title, 
                              by_method)

Other signals correlate with future restaurant visit

What will happen if we change the mobility signal from staying away from home to restaurant visit?

Pearson correlation (slicing by state)

SHIFTDAY <- 100
corr.method <- "pearson"
by_method <- "geo_value"
title <- "Median Pearson correlation between other signals and future restaurant visit signal (slicing by state)"

# Compute pearson correlation between other covidcast-like signals and restaurant visit
plot.all.Corr.Median.by.shift(covidcastlike.signals, 
                              new_res,
                              SHIFTDAY,
                              names, 
                              corr.method, 
                              title, 
                              by_method)

Pearson correlation (slicing by time)

by_method <- "time_value"
title <- "Median Pearson correlation between other signals and future restaurant visit signal (slicing by time)"

# Compute pearson correlation between other covidcast-like signals and restaurant visit
plot.all.Corr.Median.by.shift(covidcastlike.signals, 
                              new_res,
                              SHIFTDAY,
                              names, 
                              corr.method, 
                              title, 
                              by_method)

Rank correlation (slicing by state)

# Do the same for Spearman 
corr.method <- "spearman"
by_method <- "geo_value"
title <- "Median Spearman correlation between other signals and future restaurant visit signal (slicing by state)"

plot.all.Corr.Median.by.shift(covidcastlike.signals, 
                              new_res,
                              SHIFTDAY,
                              names, 
                              corr.method, 
                              title, 
                              by_method)

Rank correlation (slicing by time)

by_method <- "time_value"
title <- "Median Spearman correlation between other signals and future restaurant visit signal (slicing by time)"

plot.all.Corr.Median.by.shift(covidcastlike.signals, 
                              new_res,
                              SHIFTDAY,
                              names, 
                              corr.method, 
                              title, 
                              by_method)

Variability of policy across states

# Plot for non-distinct count and mandate distribution
policy.by.state <- state.policy[,c("StateName","StatePolicy", "Mandate")]
# Get the count
new_counts <- table(policy.by.state$StatePolicy, policy.by.state$Mandate)

# Convert to data frame
new_counts.df <- as.data.frame(new_counts)
# Change the colname for the future legend readibility
colnames(new_counts.df)[2] <- "Mandate?"

# Plot the graph for the count
p <- ggplot(new_counts.df,aes(x= reorder(Var1,Freq),Freq)) +
  geom_bar(stat ="identity")+
  coord_flip()+
  labs(title = "Count of State-wide Policy Across States", y="Distinct Count", x="Policy")
p

Filtered by mandatory policy

# Show the difference by Mandate?
p <- ggplot(new_counts.df,aes(x= reorder(Var1,Freq),Freq, fill = `Mandate?`)) +
  geom_bar(stat ="identity")+
  coord_flip()+
  labs(title = "Count of State-wide Policy Across States", y="Distinct Count", x="Policy")+
   guides(fill=guide_legend(title="Mandate?"))
p

Distinct Count of State-wide policy across states

# Filter the dataframe by distrinct rows
unique.policy.by.state <- distinct(state.policy[,c("StateName","StatePolicy")])
# Get the count
counts <- table(unique.policy.by.state$StatePolicy)

# Convert to data frame
counts.df <- as.data.frame(counts) 

# Rename the policy for better readibility
counts.df[,"Var1"] <- c("Bar restrictions", "Case-based isolation orders", "Emergency declarations", "Gathering Recommendations", "Gathering Restriction", "Non-essential business closures", "Other Business closures", "Public mask", "Travel-based quarantine order", "Restaurant restrictions", "School closures", "Stay-at-home order", "TravelRestrictEntry", "TravelRestrictExit", "TravelRestrictWithinState")

# Plot the graph for ditinct count
p <- ggplot(counts.df,aes(x= reorder(Var1,Freq),Freq, fill=Freq))+
  geom_bar(stat ="identity")+
  coord_flip()+
  labs(title = "Distinct Count of State-wide Policy Across States", y="Distinct Count", x="Policy")

p

## Lag analysis on number of policies and mobility

Correlation between number of policies and mobility by number of shifts across states

We may want to construct a simple signal to represent state-wide government intervention over time. To do so, we count the number of policies that has been enacted in a day and take the rolling average number within 7 days as an intervention signal.

# Get the dates between start and end date
all.dates <- seq(as.Date(STARTDATE), as.Date(ENDDATE), by="days")
time_value <- sort(rep(all.dates, length(unique(policy$StatePostal)) )) 

# Generate geo_value
geo_value <- rep(unique(policy$StatePostal), length(all.dates))
policy_signal <- data.frame(time_value = time_value, geo_value = geo_value)

# Create empty columns
policy_signal[,unique(policy$StatePolicy)] <- 0

# Fill in the count for each date
  # Get the policy name and state to filer policy signal 
for (row in (1:nrow(policy))){
  current.policy <- policy[row,]$StatePolicy
  current.state <- policy[row,]$StatePostal
  
  if (is.na(policy[row,]$DateEnded)){
    
    # Filter the rows of dataframe to be the current state and the time value that is after the policy is enacted.
    policy_signal[policy_signal$geo_value == current.state & policy_signal$time_value > as.Date(policy[row,]$DateEnacted), current.policy] <- 1
    
  }else{
    # Get time range between Date Enacted and Date Ended
    time.range <- seq(as.Date(policy[row,]$DateEnacted), as.Date(policy[row,]$DateEnded), by = "days")
    
    # Fill in the the rows that are in the current policy and fall between the time arrange to be 1
    policy_signal[policy_signal$time_value %in% time.range & policy_signal$geo_value == current.state, current.policy] <- 1
    }
}

# Compute the sum of the number of policies for every day in the state
policy_signal$total.num.policy <- rowSums(policy_signal[unique(policy$StatePolicy)])

# Compute the average on a 7day sliding window
policy_signal <-policy_signal %>%
    arrange(desc(geo_value)) %>% 
    group_by(geo_value) %>% 
    mutate(num.policy.7avg = rollmean(total.num.policy, k = 7, fill = NA))%>%
    ungroup()

# Finalize the covidcast-like signal for governemnt intervention
covidcast.like.policy.signal <- policy_signal %>% transmute(
  geo_value = geo_value,
  signal = "policy_7dav_num",
  time_value = time_value,
  direction = NA,
  issue = lubridate::today(),
  lag = issue - time_value,
  value = num.policy.7avg,
  stderr = NA,
  sample_size = NA,
  data_source = 'University of Washington')

# Pearson correlation between the number of policies and mobility across states
pearson_policy <- getCorrByShift(150, covidcast.like.policy.signal, data$Full.Time.Mobility, "pearson", "geo_value")

pearson_policy_med <-  getMedian(pearson_policy)

# plot the graph
p <- ggplot(pearson_policy_med , aes(x = dt, y = median)) + geom_line() + geom_point() + labs(title = "Median Pearson correlation between the number of policies and mobility", x = "Shift", y = "Correlation") +
  theme(legend.title = element_blank())

p

# Spearman correlation between the number of policies and mobility across states
spearman_policy <- getCorrByShift(150, covidcast.like.policy.signal,data$Full.Time.Mobility, "spearman", "geo_value")

spearman_policy_med <-  getMedian(spearman_policy)

s<- ggplot(spearman_policy_med, aes(x = dt, y = median)) + geom_line() + geom_point() + labs(title = "Median spearman correlation between the number of policies and mobility", x = "Shift", y = "Correlation") +
  theme(legend.title = element_blank())
s

Correlation in Space between number of policies and future mobility

# Set a bunch of fields so that the data frame knows how to plot itself
ls = list()
idx <- seq(50,125,25)
count <- 1
for (i in idx){
  policy.mobility.cor <- covidcast_cor(covidcast.like.policy.signal, data$Full.Time.Mobility, by = "geo_value", dt_x= i, method = "spearman")

  policy.mobility.cor$time_value = STARTDATE
  policy.mobility.cor$issue = STARTDATE
  attributes(policy.mobility.cor)$geo_type = "state"
  class(policy.mobility.cor) = c("covidcast_signal", "data.frame")

# Plot choropleth maps, using the covidcast plotting functionality
ls[[count]]  <-plot(policy.mobility.cor, title = sprintf("%s-day shifted num of policies and mobility", i), range = c(-1, 1), choro_col = cm.colors(10), alpha = 0.4)
count <- count + 1
}

# Plot all graphs
do.call(grid.arrange,ls)